Testing for just one site:
# log-log, with cohort fixed effects: for just one site
lm_mod_log2_cohort <- lm(log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l, site == "shaanxi"))
par(mfrow = c(2,2))
plot(lm_mod_log2_cohort)
## Warning: not plotting observations with leverage one:
## 350

summary(lm_mod_log2_cohort)
##
## Call:
## lm(formula = log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l,
## site == "shaanxi"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.049496 -0.013912 -0.000122 0.017841 0.106898
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## log(time_1):cohort1988 -0.218069 0.002324 -93.844 < 2e-16 ***
## log(time_1):cohort1989 -0.223069 0.002404 -92.795 < 2e-16 ***
## log(time_1):cohort1990 -0.217128 0.002491 -87.176 < 2e-16 ***
## log(time_1):cohort1991 -0.254079 0.002585 -98.288 < 2e-16 ***
## log(time_1):cohort1992 -0.221388 0.002688 -82.360 < 2e-16 ***
## log(time_1):cohort1993 -0.290537 0.002801 -103.730 < 2e-16 ***
## log(time_1):cohort1994 -0.248796 0.002925 -85.053 < 2e-16 ***
## log(time_1):cohort1995 -0.195754 0.003063 -63.913 < 2e-16 ***
## log(time_1):cohort1996 -0.151948 0.003216 -47.247 < 2e-16 ***
## log(time_1):cohort1997 -0.144521 0.003388 -42.659 < 2e-16 ***
## log(time_1):cohort1998 -0.184442 0.003582 -51.493 < 2e-16 ***
## log(time_1):cohort1999 -0.141598 0.003803 -37.235 < 2e-16 ***
## log(time_1):cohort2000 -0.127973 0.004057 -31.545 < 2e-16 ***
## log(time_1):cohort2001 -0.088992 0.004352 -20.448 < 2e-16 ***
## log(time_1):cohort2002 -0.063893 0.004700 -13.594 < 2e-16 ***
## log(time_1):cohort2003 -0.068657 0.005116 -13.420 < 2e-16 ***
## log(time_1):cohort2004 -0.095691 0.005623 -17.018 < 2e-16 ***
## log(time_1):cohort2005 -0.091899 0.006254 -14.693 < 2e-16 ***
## log(time_1):cohort2006 -0.081600 0.007064 -11.552 < 2e-16 ***
## log(time_1):cohort2007 -0.080951 0.008139 -9.946 < 2e-16 ***
## log(time_1):cohort2008 -0.067471 0.009639 -7.000 1.46e-11 ***
## log(time_1):cohort2009 -0.052793 0.011875 -4.446 1.20e-05 ***
## log(time_1):cohort2010 -0.037733 0.015563 -2.424 0.0159 *
## log(time_1):cohort2011 -0.036645 0.022761 -1.610 0.1084
## log(time_1):cohort2012 -0.035781 0.042656 -0.839 0.4022
## log(time_1):cohort2013 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02957 on 326 degrees of freedom
## Multiple R-squared: 0.9956, Adjusted R-squared: 0.9953
## F-statistic: 2969 on 25 and 326 DF, p-value: < 2.2e-16
broom::tidy(lm_mod_log2_cohort)
broom::glance(lm_mod_log2_cohort)
formula(lm_mod_log2_cohort)
## log(proportion) ~ 0 + log(time_1):cohort
broom::augment(lm_mod_log2_cohort)
dev.off()
## null device
## 1
# run lms for each site individually, store in lists.
# log-log model, with cohort fixed effects
lm_log2_l <- lapply(1:11, FUN = function(i) {
lm(log(proportion) ~ 0 + log(time_1):cohort,
data = filter(dat_l, site == site_names[i]))
})
# log-log model, without cohort fixed effects
lm_log2_no_cohort_l <- lapply(1:11, FUN = function(i) {
lm(log(proportion) ~ 0 + log(time_1),
data = filter(dat_l, site == site_names[i]))
})
# log-log model, with year_abn (continuous) rather than the categorical cohort
lm_log2_year_abn_l <- lapply(1:11, FUN = function(i) {
lm(log(proportion) ~ 0 + log(time_1) + log(time_1):year_abn,
data = filter(dat_l, site == site_names[i]))
})
# lon-linear (exp. decay)
lm_log_lin_l <- lapply(1:11, FUN = function(i) {
lm(log(proportion) ~ 0 + time:cohort,
data = filter(dat_l, site == site_names[i]))
})
# linear
lm_0_l <- lapply(1:11, FUN = function(i) {
lm(log(proportion) ~ 0 + time,
data = filter(dat_l, site == site_names[i]))
})
names(lm_log2_l) <- site_names
names(lm_log2_no_cohort_l) <- site_names
names(lm_log2_year_abn_l) <- site_names
names(lm_log_lin_l) <- site_names
names(lm_0_l) <- site_names
# ----------------------- log-log cohort plot ----------------------- #
# plot with age on the x-axis
ggplot(mapping = aes(x = age, y = proportion, group = year_abn, color = year_abn)) +
geom_point(data = filter(dat_l, site == "shaanxi")) +
scale_color_distiller(palette = "Greens") +
scale_x_continuous(n.breaks = 30) +
labs(title = "Decay of abandoned land over time", #subtitle = site_df$description[indx],
caption = "lm formula: log(proportion) ~ 0 + log(time_1):cohort",
x = "Time since initial abandonment (years)", y = "Proportion of abandoned land remaining",
color = "Cohort (Year \nFirst Abandoned)") +
# add the lm curves, with predictions (this works because the "newdata" df has "age" in it, as well as "time_1")
geom_line(data = mutate(augment(lm_mod_log2_cohort, newdata = filter(dat_l_0, site == "shaanxi")),
proportion = exp(.fitted))) +
# add the mean coefficient (needs to be transformed 4 to the right, so that plotting works)
geom_function(fun = function(x) { (x-4) ^ (mean(coef(lm_log2_l$shaanxi), na.rm = TRUE))},
mapping = aes(linetype = "Mean Decay Rate"),
color = "blue", size = 1, #linetype = "dashed",
inherit.aes = FALSE) +
scale_linetype(name = "")
## Warning in predict.lm(x, newdata = newdata, na.action = na.pass, ...):
## prediction from a rank-deficient fit may be misleading

# ----------------------------- #
# plot with time_1 on the x-axis
ggplot(data = filter(dat_l, site == "shaanxi"),
mapping = aes(x = time_1, y = proportion, group = year_abn, color = year_abn)) +
geom_point() + scale_color_distiller(palette = "Greens") +
geom_line(data = mutate(augment(lm_mod_log2_cohort), # or augment(lm_log2_l$shaanxi)
time_1 = exp(`log(time_1)`),
proportion = exp(.fitted),
year_abn = as.numeric(levels(cohort))[cohort])) +
scale_x_continuous(n.breaks = 30) +
# the mean coefficient
geom_function(fun = function(x) { x ^ mean(coef(lm_mod_log2_cohort), na.rm = TRUE)},
size = 1, linetype = "dashed", inherit.aes = FALSE)

your_path <- "/Users/christophercrawford/Desktop/test/"
# ----------------------- diagnostics plot, for all 11 sites in sequence ----------------------- #
lapply(1:11, FUN = function(i) {
png(filename = paste0(your_path, "log2_cohort_diag_", names(lm_log2_l[i]),".png"),
width = 8, height = 8, units = "in", res = 400)
par(mfrow = c(2,2))
plot(lm_log2_l[[i]], 1, main = site_names[i])
plot(lm_log2_l[[i]], 2)
plot(lm_log2_l[[i]], 3)
plot(lm_log2_l[[i]], 4)
dev.off()
})
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## Warning: not plotting observations with leverage one:
## 350
## [[1]]
## quartz_off_screen
## 2
##
## [[2]]
## quartz_off_screen
## 2
##
## [[3]]
## quartz_off_screen
## 2
##
## [[4]]
## quartz_off_screen
## 2
##
## [[5]]
## quartz_off_screen
## 2
##
## [[6]]
## quartz_off_screen
## 2
##
## [[7]]
## quartz_off_screen
## 2
##
## [[8]]
## quartz_off_screen
## 2
##
## [[9]]
## quartz_off_screen
## 2
##
## [[10]]
## quartz_off_screen
## 2
##
## [[11]]
## quartz_off_screen
## 2
# ----------------------- fitted vs. residuals diag plot, all 11 sites in one figure ----------------------- #
png(filename = paste0(your_path, "log2_cohort_diag_resid_v_fitted.png"),
width = 9, height = 8, units = "in", res = 400)
par(mfrow = c(3, 4))
for (i in 1:11) plot(lm_log2_l[[i]], 1, main = site_names[i])
dev.off()
## quartz_off_screen
## 2
# construct a model of *all* sites, with cohort and site fixed effects
str(dat_l)
## tibble [3,861 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ site : Factor w/ 11 levels "belarus","bosnia_herzegovina",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num [1:3861] 1992 1993 1993 1994 1994 ...
## $ year_abn : num [1:3861] 1988 1988 1989 1988 1989 ...
## $ count : num [1:3861] 2055644 1659519 1260207 1455200 1067262 ...
## $ area_ha : num [1:3861] 106770 86195 65455 75583 55433 ...
## $ age : num [1:3861] 5 6 5 7 6 5 8 7 6 5 ...
## $ bins : Factor w/ 5 levels "10 to 15 years",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ proportion : num [1:3861] 1 0.807 1 0.708 0.847 ...
## $ initial_area_abn: num [1:3861] 106770 106770 65455 106770 65455 ...
## $ year_abn_bins : Factor w/ 5 levels "(1993,1998]",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ time : num [1:3861] 0 1 0 2 1 0 3 2 1 0 ...
## $ time_1 : num [1:3861] 1 2 1 3 2 1 4 3 2 1 ...
## $ cohort : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
## - attr(*, "spec")=
## .. cols(
## .. site = col_character(),
## .. year = col_double(),
## .. year_abn = col_double(),
## .. count = col_double(),
## .. area_ha = col_double(),
## .. age = col_double(),
## .. bins = col_character(),
## .. proportion = col_double(),
## .. initial_area_abn = col_double(),
## .. year_abn_bins = col_character(),
## .. time = col_double(),
## .. time_1 = col_double(),
## .. cohort = col_double()
## .. )
# log-log model, with cohort and site fixed effects
lm_log2_cohort_all_site <- lm(log(proportion) ~ 0 + log(time_1):cohort:site, data = dat_l)
# ------------------------------------------------ #
# diagnostic plots
# ------------------------------------------------ #
par(mfrow = c(2,2))
plot(lm_log2_cohort_all_site, main = "Model with all sites, all cohorts")
## Warning: not plotting observations with leverage one:
## 350, 701, 1052, 1403, 1754, 2105, 2456, 2807, 3158, 3509, 3860

# now, without the outlier
par(mfrow = c(2,2))
plot(lm(log(proportion) ~ 0 + log(time_1):cohort:site, data = dat_l[-2092, ]), main = "Model with all sites, all cohorts")
## Warning: not plotting observations with leverage one:
## 350, 701, 1052, 1403, 1754, 2104, 2455, 2806, 3157, 3508, 3859

# now with a log(time) term and a linear time term:
lm_mod_log2_cohort_plus <- lm(log(proportion) ~ 0 + log(time_1):cohort + time_1:cohort, data = filter(dat_l, site == "shaanxi"))
ggplot(data = filter(dat_l, site == "shaanxi"),
mapping = aes(x = time_1, y = proportion, group = year_abn, color = year_abn)) +
geom_point() + scale_color_distiller(palette = "Greens") +
geom_line(data = mutate(augment(lm_mod_log2_cohort_plus), # or augment(lm_log2_l$shaanxi)
time_1 = exp(`log(time_1)`),
proportion = exp(.fitted),
year_abn = as.numeric(levels(cohort))[cohort])) +
scale_x_continuous(n.breaks = 30)

par(mfrow = c(2, 2))
plot(lm_mod_log2_cohort_plus)
## Warning: not plotting observations with leverage one:
## 325, 350, 351
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

# residuals vs. covariates
lm_mod_log2_cohort
##
## Call:
## lm(formula = log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l,
## site == "shaanxi"))
##
## Coefficients:
## log(time_1):cohort1988 log(time_1):cohort1989 log(time_1):cohort1990
## -0.21807 -0.22307 -0.21713
## log(time_1):cohort1991 log(time_1):cohort1992 log(time_1):cohort1993
## -0.25408 -0.22139 -0.29054
## log(time_1):cohort1994 log(time_1):cohort1995 log(time_1):cohort1996
## -0.24880 -0.19575 -0.15195
## log(time_1):cohort1997 log(time_1):cohort1998 log(time_1):cohort1999
## -0.14452 -0.18444 -0.14160
## log(time_1):cohort2000 log(time_1):cohort2001 log(time_1):cohort2002
## -0.12797 -0.08899 -0.06389
## log(time_1):cohort2003 log(time_1):cohort2004 log(time_1):cohort2005
## -0.06866 -0.09569 -0.09190
## log(time_1):cohort2006 log(time_1):cohort2007 log(time_1):cohort2008
## -0.08160 -0.08095 -0.06747
## log(time_1):cohort2009 log(time_1):cohort2010 log(time_1):cohort2011
## -0.05279 -0.03773 -0.03664
## log(time_1):cohort2012 log(time_1):cohort2013
## -0.03578 NA
plot(lm_mod_log2_cohort)
## Warning: not plotting observations with leverage one:
## 350




augment(lm_mod_log2_cohort) %>% arrange(desc(.hat))
# just one site
resid_df1 <- bind_cols(augment(lm_mod_log2_cohort), # with the tester model from above
filter(dat_l, site == "shaanxi"))
## New names:
## * cohort -> cohort...3
## * cohort -> cohort...21
# all sites individually, combined
resid_df <- lapply(lm_log2_l, augment) %>% bind_rows(.id = "site") %>%
bind_cols(., dat_l)
## New names:
## * site -> site...1
## * cohort -> cohort...4
## * site -> site...10
## * cohort -> cohort...22
# residuals from the full model including all sites
resid_df_all <- bind_cols(dat_l,
augment(lm_log2_cohort_all_site))
## New names:
## * site -> site...1
## * cohort -> cohort...13
## * cohort -> cohort...16
## * site -> site...17
# plot for just shaanxi
str(resid_df)
## tibble [3,861 × 22] (S3: tbl_df/tbl/data.frame)
## $ site...1 : chr [1:3861] "belarus" "belarus" "belarus" "belarus" ...
## $ log(proportion) : num [1:3861] 0 -0.214 0 -0.345 -0.166 ...
## $ log(time_1) : num [1:3861] 0 0.693 0 1.099 0.693 ...
## $ cohort...4 : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
## $ .fitted : num [1:3861] 0 -0.257 0 -0.407 -0.249 ...
## $ .std.resid : num [1:3861] 4.03e-14 1.39 1.06e-14 2.00 2.68 ...
## $ .hat : num [1:3861] 0 0.00297 0 0.00746 0.00318 ...
## $ .sigma : num [1:3861] 0.0311 0.031 0.0311 0.0309 0.0308 ...
## $ .cooksd : num [1:3861] 0 0.000229 0 0.001207 0.000914 ...
## $ site...10 : Factor w/ 11 levels "belarus","bosnia_herzegovina",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num [1:3861] 1992 1993 1993 1994 1994 ...
## $ year_abn : num [1:3861] 1988 1988 1989 1988 1989 ...
## $ count : num [1:3861] 2055644 1659519 1260207 1455200 1067262 ...
## $ area_ha : num [1:3861] 106770 86195 65455 75583 55433 ...
## $ age : num [1:3861] 5 6 5 7 6 5 8 7 6 5 ...
## $ bins : Factor w/ 5 levels "10 to 15 years",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ proportion : num [1:3861] 1 0.807 1 0.708 0.847 ...
## $ initial_area_abn: num [1:3861] 106770 106770 65455 106770 65455 ...
## $ year_abn_bins : Factor w/ 5 levels "(1993,1998]",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ time : num [1:3861] 0 1 0 2 1 0 3 2 1 0 ...
## $ time_1 : num [1:3861] 1 2 1 3 2 1 4 3 2 1 ...
## $ cohort...22 : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
## - attr(*, "terms")=Classes 'terms', 'formula' language log(proportion) ~ 0 + log(time_1):cohort
## .. ..- attr(*, "variables")= language list(log(proportion), log(time_1), cohort)
## .. ..- attr(*, "factors")= int [1:3, 1] 0 2 2
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:3] "log(proportion)" "log(time_1)" "cohort"
## .. .. .. ..$ : chr "log(time_1):cohort"
## .. ..- attr(*, "term.labels")= chr "log(time_1):cohort"
## .. ..- attr(*, "order")= int 2
## .. ..- attr(*, "intercept")= int 0
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: 0x7f7eeab92240>
## .. ..- attr(*, "predvars")= language list(log(proportion), log(time_1), cohort)
## .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:3] "log(proportion)" "log(time_1)" "cohort"
gg_resid_time <- ggplot(data = resid_df,
aes(x = time, y = .std.resid, color = year_abn)) +
geom_point() +
scale_color_distiller(palette = "Greens") +
labs(y = "Residuals", x = "Time Past Abandonment Threshold")
gg_resid_area <- ggplot(data = resid_df,
aes(x = initial_area_abn, y = .std.resid, color = year_abn)) +
geom_point(position = position_jitter(width = 20)) +
scale_color_distiller(palette = "Greens") +
labs(y = "Residuals", x = "Initial Area Abandoned") +
theme(legend.position = "none")
#install.packages("cowplot")
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
# just shaanxi
cowplot::plot_grid(gg_resid_time, gg_resid_area, rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).

# all sites
cowplot::plot_grid(
gg_resid_time %+% resid_df[-2092, ], # without outlier
gg_resid_area %+% resid_df[-2092, ], # without outlier
rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).

# residuals from the model with all sites:
cowplot::plot_grid(
gg_resid_time %+% resid_df_all[-2092, ], # without outlier
gg_resid_area %+% resid_df_all[-2092, ], # without outlier
rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).

# to plot specific sites:
cowplot::plot_grid(
gg_resid_time %+% filter(resid_df[-2092, ], site...1 == "shaanxi"), # without outlier
gg_resid_area %+% filter(resid_df[-2092, ], site...1 == "shaanxi"), # without outlier
rel_widths = c(1.2, 1))
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
